import os
import json
import torch
from openai import AzureOpenAI
from tqdm import tqdm
import sys
import re
import random
import time
import numpy as np
import argparse
from transformers import AutoModelForCausalLM, AutoTokenizer

# ---------------------------------------------------------------------------
# Command-line interface
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser()
# NOTE(review): --count_personas is parsed but not read anywhere in the
# visible code — confirm whether it is still needed.
parser.add_argument(
    '--count_personas',
    action='store_true',
    help='Output the number of personas',
)
parser.add_argument(
    '--start',
    type=int,
    default=0,
    help='Start index for dataset slicing',
)
parser.add_argument(
    '--end',
    type=int,
    default=None,
    help='End index for dataset slicing (inclusive)',
)
parser.add_argument(
    '--output_dir',
    type=str,
    default='results',
    help='Directory to save per-job JSON outputs',
)
parser.add_argument(
    '--runs_list',
    type=str,
    default='10',
    help='Comma-separated list indicating how many times to repeat each persona prediction (default "10")',
)
parser.add_argument(
    '--gmo',
    action='store_true',
    help='Run GMO CTR evaluation mode (ads)',
)
parser.add_argument(
    '--dataset_dir',
    type=str,
    default='/path/to/ctr_dataset/',
    help='Directory containing *.jsonl files for GMO evaluation',
)
parser.add_argument(
    '--limit',
    type=int,
    default=None,
    help='Total number of samples to evaluate across all dataset files (evenly sampled)',
)
parser.add_argument(
    '--seed',
    type=int,
    default=42,
    help='Random seed to ensure consistent sampling across runs',
)
args = parser.parse_args()

# Resolve the output directory to an absolute path, store it back on ``args``
# so every later consumer sees the same location, and make sure it exists.
output_dir = os.path.abspath(args.output_dir)
args.output_dir = output_dir
os.makedirs(output_dir, exist_ok=True)

# Seed Python's and NumPy's global RNGs from --seed.
# NOTE(review): torch is not seeded here, so token sampling in the local
# models (do_sample=True) is not controlled by --seed.
np.random.seed(args.seed)
random.seed(args.seed)

# Persona instruction texts, keyed by "<age range>_<gender>".
#
# run_gmo_evaluation_multi() prepends one of these texts to every ad prompt
# and parses the number following "Answer" out of the model's reply, so each
# text ends by requesting a "Reason:" line and an "Answer: [0–100]" line.
#
# Insertion order matters: PERSONA_TO_LLM assigns an LLM to each persona
# round-robin in the order the keys appear here.
persona_prompts = {
    "18-24_female": """You are a digital ad analyst who is also a woman in the 18-24 age group. You're deeply familiar with what resonates with your generation—emotional authenticity, aesthetic quality, bold individuality, and social relevance. You instinctively recognize what appeals to younger women: body positivity, mental health awareness, empowerment, humor, and cultural fluency (like meme literacy or TikTok trends).

You've been shown up to five similar ads with their CTR performance (from 0 to 100), and now you must predict how well a new ad will perform with women your age. Use your personal insight, generational awareness, and pattern recognition to evaluate it.

Return:
Reason: [In one sentence, why you think this ad will perform that way]
Answer: [0–100]""",

    "18-24_male": """You are a digital ad analyst who is also a man in the 18-24 age group. You understand the mindset of young men—seeking confidence, entertainment, edge, and relevance. You're fluent in gaming, influencer culture, memes, and the kind of humor or visual punch that lands with this group.

Given several example ads with performance scores, and a new ad to assess, you must judge how likely it is to capture attention and convert for men your age.

Return:
Reason: [In one sentence, why this ad will or won't work for your group]
Answer: [0–100]""",

    "25-34_female": """You are a digital ad analyst who is also a woman in the 25-34 age group. You understand the balance this age group seeks—between career ambitions, lifestyle goals, personal growth, and relationships. Ads that show aesthetic clarity, empowerment, self-care, and intelligent value propositions resonate well.

You're given several similar ads and their CTR percentiles, followed by a new one to evaluate. Use your cultural fluency and marketing insight to estimate performance among women in your age group.

Return:
Reason: [In one sentence, your age group's likely response and why]
Answer: [0–100]""",

    "25-34_male": """You are a digital ad analyst and a man aged 25–34. You know what appeals to this demographic—ads that are ambitious, tech-savvy, direct, a little edgy, and tied to lifestyle aspiration (fitness, career growth, travel, or finance). You are skeptical of fluff, and you appreciate clarity, wit, and efficiency.

Given past ads with known CTR and a new one, predict its success based on how well it aligns with male values in this age range.

Return:
Reason: [In one sentence, why you predict this level of engagement]
Answer: [0–100]""",

    "35-44_female": """You're a digital ad analyst and a woman aged 35–44. You understand the balance your generation strikes—juggling responsibilities, making informed decisions, but also seeking meaningful and joyful moments. Emotional intelligence, warmth, family, health, and practical luxury matter here.

Given example ads with performance data, evaluate a new ad's resonance with your peers.

Return:
Reason: [In one sentence, how well the ad speaks to this age group's values]
Answer: [0–100]""",

    "35-44_male": """You're a digital ad analyst and a man aged 35–44. You know your generation values trust, utility, and smart messaging. You're discerning, experienced, and don't fall for fluff. You appreciate ads that are well-crafted, respect your intelligence, and deliver real value—especially around career, family, and financial growth.

Given ad examples and a new target ad, predict how well it will land with men in your cohort.

Return:
Reason: [In one sentence, why this ad will or won't resonate]
Answer: [0–100]""",

    "45-54_female": """You're a digital ad analyst and a woman aged 45–54. You represent a generation that values trust, authenticity, and depth. You're less interested in trendiness and more in whether an ad respects your intelligence and lived experience. You care about wellness, family, quality, and emotional clarity.

Reviewing example ads and a new one, estimate how it will perform among women like you.

Return:
Reason: [In one sentence, your interpretation of how this ad fits generational taste]
Answer: [0–100]""",

    "45-54_male": """You're a digital ad analyst and a man aged 45–54. You've seen advertising evolve and know when something is thoughtful versus superficial. You appreciate ads that offer clarity, real-world value, and a touch of inspiration. Trust and practical appeal matter most.

Given ad performance examples and a new one, analyze how it will fare with men your age.

Return:
Reason: [In one sentence, your logic for the predicted score]
Answer: [0–100]""",

    "55+_female": """You are a seasoned digital ad analyst and a woman over 55. You don't just evaluate ads—you see them through decades of shifting media, values, and cultural narratives. You know your peers prefer messaging that's clear, emotionally resonant, warm, and respectful. Themes like wellness, family, community, and security matter deeply.

You're shown similar ad examples with CTRs, followed by a new one. Predict its performance based on emotional tone, clarity, and meaningfulness to older women.

Return:
Reason: [In one sentence, why your age group will or won't respond to this ad]
Answer: [0–100]""",

    "55+_male": """You are a digital ad analyst and a man aged 55 or older. You've experienced enough advertising to know when something is valuable versus manipulative. You favor sincerity, logic, and emotional grounding. You value health, legacy, family, security, and clarity.

Given example ads and a new one, analyze how it will perform for men your age.

Return:
Reason: [In one sentence, your thoughtful rationale]
Answer: [0–100]"""
}

# persona_prompts = {
#     "18-24_female": """You are a woman aged 18–24. You're fluent in digital aesthetics, raised on platforms like TikTok and Instagram. You notice instantly if something has a vibe—bold colors, expressive fonts, emotional tone, or modern, fun design. Websites that are cluttered, generic, or try-hard are less likely to appeal to you.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on its visual design, layout, color scheme, and content.

# Return:
# Reason: [Why this website does or doesn't appeal to you visually and emotionally]
# Answer: [0–10] ← You must include this score.""",

#     "25-34_male": """You are a man aged 25–34. You value strong, clear, and modern visuals. You're likely to appreciate websites that are bold but not messy—clean grids, high contrast, sharp fonts, and relevant content (fitness, tech, ambition, money).

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on its layout, visual punch, and message.

# Return:
# Reason: [Why this design works for you—or not]
# Answer: [0–10] ← You must include this score.""",

#     "35-44_female": """You are a woman aged 35–44. You're drawn to websites that are intentional, emotionally intelligent, and visually clean. Family, meaning, and beauty in simplicity appeal to you more than trend-driven clutter.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on its design, clarity, and emotional tone.

# Return:
# Reason: [Describe what you liked—or didn't—in terms of layout, tone, and aesthetic]
# Answer: [0–10] ← You must include this score.""",

#     "45-54_male": """You are a man aged 45–54. You prefer websites that are easy to navigate, focused, and visually grounded. You're drawn to sites that reflect purpose and clarity over trend or flash.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on usability, structure, and message.

# Return:
# Reason: [What stood out to you—positively or negatively—in its design or layout]
# Answer: [0–10] ← You must include this score.""",

#     "55+_female": """You are a woman aged 55 or older. You appreciate websites that feel meaningful, visually calm, and easy to understand. Gentle color palettes, clear fonts, and emotionally warm content make a big difference.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on design simplicity and emotional tone.

# Return:
# Reason: [How the design made you feel—comforted, confused, interested, or indifferent]
# Answer: [0–10] ← You must include this score.""",
# }

# persona_prompts = {
#     "18-24_female_v1": """You are a woman aged 18–24 with an annual income over $100K. You're fluent in digital aesthetics, raised on platforms like TikTok and Instagram. You notice instantly if something has a vibe—bold colors, expressive fonts, emotional tone, or modern, fun design.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on its visual design, layout, color scheme, and content.

# Return your response in this exact format:
# Answer: [0–10]
# Reason: [Why this website does or doesn't appeal to you visually and emotionally in minimal words]""",

#     "18-24_female_v2": """You are a woman aged 18–24 with an annual income under $30,000. You're fluent in digital aesthetics, raised on platforms like TikTok and Instagram. You notice instantly if something has a vibe—bold colors, expressive fonts, emotional tone, or modern, fun design.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on its visual design, layout, color scheme, and content.

# Return your response in this exact format:
# Answer: [0–10]
# Reason: [Why this website does or doesn't appeal to you visually and emotionally in minimal words]""",

#     "18-24_male_v1": """You are a man aged 18–24 with an annual income over $100K. You're used to fast-scroll content and visual punch—memes, Twitch, TikTok, YouTube. You like websites that grab attention fast and look slick or high-end.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on visuals, usability, and vibe.

# Return your response in this exact format:
# Answer: [0–10]
# Reason: [Your opinion on whether the website looks cool, clean, boring, or chaotic in minimal words]""",

#     "18-24_male_v2": """You are a man aged 18–24 with an annual income under $30,000. You're used to fast-scroll content and visual punch—memes, Twitch, TikTok, YouTube. You like websites that grab attention fast and look slick or high-end.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on visuals, usability, and vibe.

# Return your response in this exact format:
# Answer: [0–10]
# Reason: [Your opinion on whether the website looks cool, clean, boring, or chaotic in minimal words]""",

#     "25-34_female_v1": """You are a woman aged 25–34 with an annual income over $100K. You appreciate polished, aspirational websites that match your lifestyle—wellness, creativity, relationships, career. You value clarity, taste, and brand maturity.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on design, clarity, aesthetics, and content.

# Return your response in this exact format:
# Answer: [0–10]
# Reason: [Explain what makes it feel appealing, elegant, or uninviting in minimal words]""",

#     "25-34_female_v2": """You are a woman aged 25–34 with an annual income under $30,000. You appreciate polished, aspirational websites that match your lifestyle—wellness, creativity, relationships, career. You value clarity, taste, and brand maturity.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on design, clarity, aesthetics, and content.

# Return your response in this exact format:
# Answer: [0–10]
# Reason: [Explain what makes it feel appealing, elegant, or uninviting in minimal words]""",

#     "25-34_male_v1": """You are a man aged 25–34 with an annual income over $100K. You value websites that feel sharp, modern, and confident. Bold layouts, strong CTAs, high-quality visuals—especially in tech, money, or fitness—stand out to you.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on its layout, visual punch, and message.

# Return your response in this exact format:
# Answer: [0–10]
# Reason: [Why this design works for you—or not in minimal words]""",

#     "25-34_male_v2": """You are a man aged 25–34 with an annual income under $30,000. You value websites that feel sharp, modern, and confident. Bold layouts, strong CTAs, high-quality visuals—especially in tech, money, or fitness—stand out to you.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on its layout, visual punch, and message.

# Return your response in this exact format:
# Answer: [0–10]
# Reason: [Why this design works for you—or not in minimal words]""",

#     "35-44_female_v1": """You are a woman aged 35–44 with an annual income over $100K. You appreciate websites that are refined, emotionally intelligent, and thoughtfully curated. You value elegance and simplicity, especially when paired with a sense of emotional resonance and purpose.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on its design, clarity, and emotional tone.

# Return your response in this exact format:
# Answer: [0–10]
# Reason: [Describe what you liked—or didn't—in terms of layout, tone, and aesthetic in minimal words]""",

#     "35-44_female_v2": """You are a woman aged 35–44 with an annual income under $30,000. You appreciate websites that are refined, emotionally intelligent, and thoughtfully curated. You value elegance and simplicity, especially when paired with a sense of emotional resonance and purpose.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on its design, clarity, and emotional tone.

# Return your response in this exact format:
# Answer: [0–10]
# Reason: [Describe what you liked—or didn't—in terms of layout, tone, and aesthetic in minimal words]""",

#     "35-44_male_v1": """You are a man aged 35–44 with an annual income over $100K. You expect websites to be efficient, modern, and professionally designed. Visual clarity, smart layout, and purposeful messaging signal value to you.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on structure, relevance, and visual balance.

# Return your response in this exact format:
# Answer: [0–10]
# Reason: [Explain what makes this site feel appealing or forgettable in minimal words]""",

#     "35-44_male_v2": """You are a man aged 35–44 with an annual income under $30,000. You expect websites to be efficient, modern, and professionally designed. Visual clarity, smart layout, and purposeful messaging signal value to you.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on structure, relevance, and visual balance.

# Return your response in this exact format:
# Answer: [0–10]
# Reason: [Explain what makes this site feel appealing or forgettable in minimal words]""",
# }




# -----------------------------
# Azure OpenAI Configuration (GPT models)
# -----------------------------
# API version string sent with every Azure OpenAI request.
api_version = "2024-02-15-preview"

# Connection settings consumed by _verbalize_direct_gpt().
# The key and endpoint are taken from the environment when set (so secrets do
# not have to be committed to the source file); the original in-file
# placeholders remain as fallbacks, so editing them here still works.
config_dict = {
    'api_key': os.environ.get("AZURE_OPENAI_API_KEY") or "YOUR_OPENAI_API_KEY",
    'api_version': api_version,
    'azure_endpoint': os.environ.get("AZURE_OPENAI_ENDPOINT") or "https://your-azure-openai-endpoint/",
}

# -----------------------------
# Local LLMs (Qwen & Llama) setup
# -----------------------------

# Cache to avoid re-loading models multiple times.
# Keyed by model key ("qwen" / "llama"); each value is the tuple stored by
# _load_local_model(): (model, tokenizer, add_no_think, enable_thinking_flag).
_LOCAL_MODEL_CACHE: dict[str, tuple] = {}


def _load_local_model(model_key: str) -> tuple:
    """Lazy-load and cache a local Hugging Face causal LM and its tokenizer.

    Args:
        model_key: Either "qwen" or "llama".

    Returns:
        The cached 4-tuple ``(model, tokenizer, add_no_think,
        enable_thinking_flag)``:

        * ``add_no_think`` – whether callers should prefix the user message
          with the "/no_think" switch.
        * ``enable_thinking_flag`` – whether the chat template must be called
          with ``enable_thinking=False`` (despite its name, a true value means
          "thinking has to be switched off explicitly").

    Raises:
        ValueError: If ``model_key`` is not one of the supported keys.
        OSError: If loading still fails on the fifth attempt.
    """

    if model_key in _LOCAL_MODEL_CACHE:
        return _LOCAL_MODEL_CACHE[model_key]

    if model_key == "qwen":
        model_name = "Qwen/Qwen3-14B"
        add_no_think = True
        enable_thinking_flag = True  # caller passes enable_thinking=False
    elif model_key == "llama":
        model_name = "meta-llama/Llama-3.3-70B-Instruct"
        add_no_think = False
        enable_thinking_flag = False
    else:
        raise ValueError(f"Unknown model_key {model_key}")

    # Local import: only needed when a model is actually loaded.
    from transformers import BitsAndBytesConfig

    # Robust loading with up to 5 retries (handles transient HF hub connectivity issues)
    for attempt in range(1, 6):
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype="auto",
                device_map="auto",
                # Passing load_in_4bit directly to from_pretrained is
                # deprecated; the quantization config is the supported way.
                quantization_config=BitsAndBytesConfig(load_in_4bit=True),
            )
            break  # success
        except OSError as e:
            if attempt == 5:
                # Give up after final attempt
                raise
            wait_time = 2 ** attempt  # exponential back-off: 2, 4, 8, 16 seconds
            print(
                f"[WARNING] Failed to load {model_name} (attempt {attempt}/5): {e}. Retrying in {wait_time}s…",
                file=sys.stderr,
            )
            time.sleep(wait_time)

    _LOCAL_MODEL_CACHE[model_key] = (model, tokenizer, add_no_think, enable_thinking_flag)
    return _LOCAL_MODEL_CACHE[model_key]


def verbalize_local(model_key: str, full_prompt: str) -> str:
    """Generate a completion using a local (HF) model – Qwen or Llama.

    Args:
        model_key: Key understood by ``_load_local_model`` ("qwen" or "llama").
        full_prompt: Persona text plus user query, sent as a single user
            message. For models that use the "/no_think" switch it is added
            here; a "/no_think" prefix already present on ``full_prompt`` is
            not duplicated.

    Returns:
        The newly generated text (prompt tokens excluded), stripped of
        surrounding whitespace.
    """
    model, tokenizer, add_no_think, enable_thinking_flag = _load_local_model(model_key)

    # Build chat template compatible messages. The switch is separated from
    # the prompt by a space so it is not fused with the prompt's first word,
    # and any prefix the caller already added is normalised to a single one.
    user_content = full_prompt
    if add_no_think:
        user_content = "/no_think " + full_prompt.removeprefix("/no_think").lstrip()
    messages = [{"role": "user", "content": user_content}]

    template_kwargs = {
        "tokenize": True,
        "add_generation_prompt": True,
        "return_tensors": "pt",
    }

    # For Qwen we must explicitly disable "thinking" tokens
    if enable_thinking_flag:
        template_kwargs["enable_thinking"] = False

    # The conversation must be passed positionally: the parameter is named
    # `conversation`, so a `messages=` keyword leaves it unfilled and raises
    # TypeError.
    input_ids = tokenizer.apply_chat_template(messages, **template_kwargs)
    input_ids = input_ids.to(model.device)

    gen_kwargs = {
        "max_new_tokens": 1200,
        "temperature": 0.85,
        "do_sample": True,
        "use_cache": True,
    }

    # min_p is supported by the Qwen generation strategy
    if model_key == "qwen":
        gen_kwargs["min_p"] = 0.1

    with torch.no_grad():
        outputs = model.generate(input_ids=input_ids, **gen_kwargs)

    # Drop the echoed prompt tokens; decode only what the model generated.
    response = tokenizer.decode(outputs[0][len(input_ids[0]):], skip_special_tokens=True)
    return response.strip()


# -----------------------------
# LLM assignment per-persona
# -----------------------------

# Round-robin assignment of back-ends: personas are taken in dict order and
# mapped to GPT, Qwen, Llama, GPT, Qwen, ... in turn.
_persona_keys = list(persona_prompts)
_llm_cycle = ["gpt", "qwen", "llama"]
PERSONA_TO_LLM = dict(
    zip(
        _persona_keys,
        (_llm_cycle[pos % len(_llm_cycle)] for pos in range(len(_persona_keys))),
    )
)

# --------------------------------------------------------------------------------

def create_persona_system_prompt(persona_specification):
    """Placeholder for building a system prompt from a persona specification.

    Unused in the CTR task: the argument is ignored and an empty string is
    always returned.
    """
    return ""


def get_json_data_generate(sys_prompt, user_prompt, images):
    """Legacy GPT-4V helper kept as a stub; not used in the CTR task.

    All arguments are ignored and a new empty dict is returned.
    """
    return {}


def verbalize(prompt, sys_prompt, images):
    """Legacy vision-GPT entry point kept as a stub; not used in the CTR task.

    All arguments are ignored and an empty string is returned.
    """
    return ""

# ----------------------------- GPT Helper -----------------------------

def _verbalize_direct_gpt(full_prompt: str) -> str:
    """Call the Azure GPT model with a single user message (persona + ad prompt).

    Args:
        full_prompt: Complete text sent as the only (user) message.

    Returns:
        The first choice's message text with surrounding whitespace removed.
        If the service returns no text for that choice (``content`` is
        ``None``), an empty string is returned instead of failing on
        ``None.strip()``.
    """
    # Build the client once and reuse it: this function is called for every
    # repetition of every persona, and constructing a new client each time
    # also creates a new HTTP connection pool each time.
    client = getattr(_verbalize_direct_gpt, "_client", None)
    if client is None:
        client = AzureOpenAI(
            api_key=config_dict['api_key'],
            api_version=config_dict['api_version'],
            azure_endpoint=config_dict['azure_endpoint'],
        )
        _verbalize_direct_gpt._client = client

    messages = [{"role": "user", "content": full_prompt}]
    resp = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        max_tokens=350,
        temperature=0.85,
        n=1,
    )
    content = resp.choices[0].message.content
    return (content or "").strip()

# ---------------------------------------------------------------------------
# GMO record helpers (copied/adapted from gpt_ctr.py)
# ---------------------------------------------------------------------------

def _sample_gmo_records(dataset_dir: str, total_limit: int | None):
    """Randomly sample records from each *.jsonl file in `dataset_dir`.

    If `total_limit` is provided, samples are taken as `total_limit // n_files` per file.
    Otherwise, all records from each file are returned.
    """
    file_paths = [os.path.join(dataset_dir, fp) for fp in os.listdir(dataset_dir) if fp.endswith('.jsonl')]
    if not file_paths:
        print(f"[ERROR] No .jsonl files found in {dataset_dir}", file=sys.stderr)
        sys.exit(1)

    random.shuffle(file_paths)  # shuffle to avoid ordering bias
    records = []
    per_file = None
    if total_limit is not None and total_limit > 0:
        per_file = max(1, total_limit // len(file_paths))

    for fp in file_paths:
        with open(fp, 'r', encoding='utf-8') as f_in:
            lines = f_in.readlines()
        if per_file is not None:
            chosen = random.sample(lines, min(per_file, len(lines)))
        else:
            chosen = lines
        for ln in chosen:
            try:
                rec = json.loads(ln)
                rec['_source_file'] = os.path.basename(fp)
                records.append(rec)
            except json.JSONDecodeError:
                continue  # skip malformed lines

    # If we overshot the limit due to rounding, trim back down
    if total_limit is not None and len(records) > total_limit:
        records = random.sample(records, total_limit)
    return records


def _load_np_subset_prompts(path: str):
    """Collect the 'prompt' values stored in a prior NP results JSON file.

    The file is parsed as JSON and iterated; every dict entry contributes its
    'prompt' value ('' when the key is absent), other entries are ignored.
    A missing or unreadable file prints an error to stderr and exits with
    status 1.
    """
    if os.path.isfile(path):
        try:
            with open(path, 'r', encoding='utf-8') as f_in:
                loaded = json.load(f_in)
        except Exception as e:
            print(f"[ERROR] Failed to load NP subset file {path}: {e}", file=sys.stderr)
            sys.exit(1)

        prompts = set()
        for entry in loaded:
            if isinstance(entry, dict):
                prompts.add(entry.get('prompt', ''))
        return prompts

    print(f"[ERROR] NP subset file not found: {path}", file=sys.stderr)
    sys.exit(1)


# ---------------------------------------------------------------------------
# Pre-load local models before starting the main evaluation loop
# ---------------------------------------------------------------------------

def _preload_local_models():
    """Warm the local-model cache for every local LLM assigned to a persona.

    Qwen is loaded before Llama; nothing is printed or loaded when neither
    appears in PERSONA_TO_LLM.
    """
    assigned = set(PERSONA_TO_LLM.values())
    wanted = [key for key in ("qwen", "llama") if key in assigned]
    if not wanted:
        return

    print("[INFO] Pre-loading local text LLMs... This may take a few minutes.")
    for key in wanted:
        _load_local_model(key)
    print("[INFO] Local models loaded.")


# Finds the number following "Answer" in a model reply (case-insensitive,
# allowing up to 10 non-digit characters such as ": [" in between).
_ANSWER_SCORE_RE = re.compile(r"(?i)answer[^0-9]{0,10}(\d{1,3}(?:\.\d+)?)")


def _extract_ctr_score(resp_text: str) -> float | None:
    """Return the first "Answer" number in *resp_text* clamped to [0, 100], or None."""
    match = _ANSWER_SCORE_RE.search(resp_text)
    if match is None:
        return None
    return max(0.0, min(100.0, float(match.group(1))))


def run_gmo_evaluation_multi(args):
    """Multi-LLM GMO CTR evaluation (mirrors gpt_ctr.py but cycles personas).

    Samples records via ``_sample_gmo_records``, keeps the inclusive slice
    ``args.start``..``args.end``, and for each entry of the module-level
    ``runs_list`` queries every persona that many times per record. One JSON
    line per record is appended to a file in ``args.output_dir`` containing
    the prompt, ground truth, per-persona predictions/responses and the mean
    of the per-persona means.

    Args:
        args: Parsed CLI namespace; reads ``dataset_dir``, ``limit``,
            ``start``, ``end`` and ``output_dir``.
    """

    records = _sample_gmo_records(args.dataset_dir, args.limit)

    # Apply slicing (both bounds inclusive, clamped to the available records)
    slice_start = max(0, args.start)
    slice_end = args.end if args.end is not None else len(records) - 1
    slice_end = min(slice_end, len(records) - 1)
    records = records[slice_start : slice_end + 1]

    print(f"[INFO] Running GMO evaluation (multi-LLM) on {len(records)} records slice {slice_start}-{slice_end}.")

    for n_runs in runs_list:
        print("\n" + "=" * 80)
        print(f"Running evaluation with {n_runs} repetitions per datapoint…")
        print("=" * 80)

        out_name = (
            f"gmo_persona_results_runs{n_runs}_samples{args.limit or 'all'}_{slice_start}_{slice_end}.jsonl"
        )
        out_path = os.path.join(args.output_dir, out_name)

        # Remove previous run file if it exists (results are appended below)
        if os.path.exists(out_path):
            os.remove(out_path)

        for rec in tqdm(records, desc=f"GMO Samples x{n_runs}"):
            ad_prompt = rec.get("prompt", "")
            ground_truth = rec.get("response")

            persona_data = {}
            persona_means = []  # only personas that produced at least one score

            for persona_name, persona_text in persona_prompts.items():
                llm_type = PERSONA_TO_LLM.get(persona_name, "gpt")
                # Identical for every repetition, so build it once.
                full_prompt = f"{persona_text}\n\n{ad_prompt}"
                predictions = []
                responses = []

                for _ in range(n_runs):
                    try:
                        if llm_type == "gpt":
                            resp_text = _verbalize_direct_gpt(full_prompt)
                        else:
                            # verbalize_local() adds the Qwen "/no_think"
                            # switch itself; prefixing here would double it.
                            resp_text = verbalize_local(llm_type, full_prompt)
                    except Exception as e:
                        # Best effort: a failed call just yields no sample.
                        print(f"[WARNING] LLM error for persona {persona_name}: {e}")
                        continue

                    score = _extract_ctr_score(resp_text)
                    if score is not None:
                        predictions.append(score)
                    responses.append(resp_text)

                mean_score = float(np.mean(predictions)) if predictions else None
                if mean_score is not None:
                    persona_means.append(mean_score)
                persona_data[persona_name] = {
                    "predictions": predictions,
                    "mean_prediction": mean_score,
                    "responses": responses,
                }

            # None (JSON null) when no persona produced a score; a NaN here
            # would be serialised as the invalid JSON token "NaN".
            overall_mean = float(np.mean(persona_means)) if persona_means else None

            result_record = {
                "prompt": ad_prompt,
                "ground_truth": ground_truth,
                "personas": persona_data,
                "overall_mean_prediction": overall_mean,
                "source_file": rec.get("_source_file"),
            }

            # Incremental save to JSONL so partial progress survives a crash
            try:
                with open(out_path, "a", encoding="utf-8") as f_out:
                    f_out.write(json.dumps(result_record) + '\n')
            except Exception as e:
                print(f"[WARNING] Incremental save to JSONL failed: {e}")

        print(f"[INFO] GMO evaluation runs={n_runs} saved to {out_path}")


# ---------------------------------------------------------------------------
# Main execution block
# ---------------------------------------------------------------------------

# Turn the comma-separated --runs_list value into a list of positive integers
# (blank entries are skipped, non-positive counts are dropped).
runs_list = [
    count
    for count in (int(token) for token in args.runs_list.split(',') if token.strip())
    if count > 0
]
if len(runs_list) == 0:
    raise ValueError("--runs_list must contain at least one positive integer")


if __name__ == "__main__":
    # Only the GMO CTR mode is implemented; any other invocation is rejected.
    if not getattr(args, 'gmo', False):
        print("[ERROR] This script now only supports GMO CTR evaluation. Please run with the --gmo flag.", file=sys.stderr)
        sys.exit(1)

    _preload_local_models()
    run_gmo_evaluation_multi(args)